library(haven)
library(data.table)
library(stringr)
library(dplyr)
library(tidyr)
library(tidyverse)
library(BTM)
library(udpipe)
library(stopwords)

# set path
# NOTE(review): rm(list = ls()) and setwd() change session-wide state, and the
# path below is a placeholder that has to be pointed at the replication-kit
# folder before the script is run.
rm(list = ls())
setwd(".../replication kit/intermediate files and code")

# load raw data on tweets and posts (only the id, date and text columns)
data <- read_dta("fb tw micro data.dta", col_select = c(id, date, text))

# pick relevant time frame (both bounds inclusive)
data$date <- as.Date(data$date, "%Y-%m-%d")
window_start <- as.Date("2016-02-16")
window_end <- as.Date("2016-04-16")
# Rows with a missing date are excluded explicitly: a logical row index that
# contains NA returns all-NA rows instead of dropping them.
in_window <- !is.na(data$date) &
  data$date >= window_start &
  data$date <= window_end
data <- data[in_window, ]
data <- as.data.table(data)

# create biterms
# Fetch the German UDPipe model used for the annotation below.
dl <- udpipe_download_model(language = "german")

# udpipe() reads the document identifier from a column named doc_id.
data <- rename(data, doc_id = id)

# Tokenise, tag and lemmatise every text; `anno` has one row per token.
anno <- udpipe(data, "german", trace = 1000, parallel.cores = 5)

# Evaluate the stopword list once, rather than once per document inside the
# grouped `j` expression below.
german_stopwords <- stopwords("de")

# Within each document, collect co-occurrences of lemmas at most 3 positions
# apart (skipgram = 3). Only nouns, adjectives and verbs longer than two
# characters that are not German stopwords count as relevant.
biterms <- as.data.table(anno)
biterms <- biterms[, cooccurrence(x = lemma,
                                  relevant = upos %in% c("NOUN", "ADJ", "VERB") &
                                             nchar(lemma) > 2 &
                                             !lemma %in% german_stopwords,
                                  skipgram = 3),
                   by = list(doc_id)]

# Fix the random seed before the models below are fitted.
set.seed(123456)

# Training tokens: one row per kept token, reduced to its document id and
# lemma. A token is kept when it is a noun, adjective or verb, its lemma is
# not a German stopword, and the lemma is longer than two characters.
content_pos <- c("NOUN", "ADJ", "VERB")
keep_token <- anno$upos %in% content_pos &
  !(anno$lemma %in% stopwords("de")) &
  nchar(anno$lemma) > 2
# subset() drops rows whose condition is NA as well as those that are FALSE.
traindata <- subset(anno, keep_token, select = c("doc_id", "lemma"))


# Fit one biterm topic model per candidate number of topics and record the
# log-likelihood of each. The models are fitted in the order listed here, so
# the sequence of calls (and with it the use of the random number stream) is
# the same as when every model was written out separately.
topic_counts <- c(5, 10, 15, 20, 25, 30, 40, 50, 100)

models <- vector("list", length(topic_counts))
names(models) <- paste0("model", topic_counts)
fits <- vector("list", length(topic_counts))
names(fits) <- paste0("fit", topic_counts)

for (i in seq_along(topic_counts)) {
  models[[i]] <- BTM(traindata, biterms = biterms, k = topic_counts[i],
                     iter = 500, trace = 100)
  # logLik() on a BTM model returns a list; `ll` holds the log-likelihood.
  fits[[i]] <- logLik(models[[i]])$ll
}

# Keep the individual objects (model5, fit5, ..., model100, fit100) defined
# under their historical names for any code that still refers to them.
invisible(list2env(c(models, fits), envir = environment()))

#### combine log-likelihoods
# One row per model, with row names fit5, fit10, ..., fit100.
all_fit <- do.call(rbind, fits)

write.table(all_fit, "ll_compare_fit.csv", sep = ",")



#### final model
# Fit the model with 18 topics on the same training tokens and biterms.
model18 <- BTM(traindata, biterms = biterms, k = 18, iter = 500, trace = 100)

# Export the 50 highest-ranked terms of every topic. col.names = NA writes an
# empty header cell above the row-name column.
terms18 <- model18 %>%
  terms(top_n = 50) %>%
  as.data.frame()
write.table(terms18, file = "top_terms18.csv", sep = ",", col.names = NA)

# Score the annotated items with the fitted model and export the result.
# NOTE(review): the model is trained on the `lemma` column of the filtered
# `traindata`, while prediction is run on the full annotation `anno` --
# confirm which columns predict() reads from a data frame with more than two
# columns and that this is intended.
scores18 <- as.data.frame(predict(model18, newdata = anno))
write.table(scores18, file = "topics_by_item18.csv", sep = ",")










